1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57 package org.apache.lucene.analysis.en;
58
59 import org.apache.lucene.analysis.util.CharArrayMap;
60 import org.apache.lucene.analysis.util.OpenStringBuilder;
61
62
63
64
65
66
67
68
69
70
71 public class KStemmer {
72 static private final int MaxWordLen = 50;
73
74 static private final String[] exceptionWords = {"aide", "bathe", "caste",
75 "cute", "dame", "dime", "doge", "done", "dune", "envelope", "gage",
76 "grille", "grippe", "lobe", "mane", "mare", "nape", "node", "pane",
77 "pate", "plane", "pope", "programme", "quite", "ripe", "rote", "rune",
78 "sage", "severe", "shoppe", "sine", "slime", "snipe", "steppe", "suite",
79 "swinge", "tare", "tine", "tope", "tripe", "twine"};
80
81 static private final String[][] directConflations = { {"aging", "age"},
82 {"going", "go"}, {"goes", "go"}, {"lying", "lie"}, {"using", "use"},
83 {"owing", "owe"}, {"suing", "sue"}, {"dying", "die"}, {"tying", "tie"},
84 {"vying", "vie"}, {"aged", "age"}, {"used", "use"}, {"vied", "vie"},
85 {"cued", "cue"}, {"died", "die"}, {"eyed", "eye"}, {"hued", "hue"},
86 {"iced", "ice"}, {"lied", "lie"}, {"owed", "owe"}, {"sued", "sue"},
87 {"toed", "toe"}, {"tied", "tie"}, {"does", "do"}, {"doing", "do"},
88 {"aeronautical", "aeronautics"}, {"mathematical", "mathematics"},
89 {"political", "politics"}, {"metaphysical", "metaphysics"},
90 {"cylindrical", "cylinder"}, {"nazism", "nazi"},
91 {"ambiguity", "ambiguous"}, {"barbarity", "barbarous"},
92 {"credulity", "credulous"}, {"generosity", "generous"},
93 {"spontaneity", "spontaneous"}, {"unanimity", "unanimous"},
94 {"voracity", "voracious"}, {"fled", "flee"}, {"miscarriage", "miscarry"}};
95
96 static private final String[][] countryNationality = {
97 {"afghan", "afghanistan"}, {"african", "africa"},
98 {"albanian", "albania"}, {"algerian", "algeria"},
99 {"american", "america"}, {"andorran", "andorra"}, {"angolan", "angola"},
100 {"arabian", "arabia"}, {"argentine", "argentina"},
101 {"armenian", "armenia"}, {"asian", "asia"}, {"australian", "australia"},
102 {"austrian", "austria"}, {"azerbaijani", "azerbaijan"},
103 {"azeri", "azerbaijan"}, {"bangladeshi", "bangladesh"},
104 {"belgian", "belgium"}, {"bermudan", "bermuda"}, {"bolivian", "bolivia"},
105 {"bosnian", "bosnia"}, {"botswanan", "botswana"},
106 {"brazilian", "brazil"}, {"british", "britain"},
107 {"bulgarian", "bulgaria"}, {"burmese", "burma"},
108 {"californian", "california"}, {"cambodian", "cambodia"},
109 {"canadian", "canada"}, {"chadian", "chad"}, {"chilean", "chile"},
110 {"chinese", "china"}, {"colombian", "colombia"}, {"croat", "croatia"},
111 {"croatian", "croatia"}, {"cuban", "cuba"}, {"cypriot", "cyprus"},
112 {"czechoslovakian", "czechoslovakia"}, {"danish", "denmark"},
113 {"egyptian", "egypt"}, {"equadorian", "equador"},
114 {"eritrean", "eritrea"}, {"estonian", "estonia"},
115 {"ethiopian", "ethiopia"}, {"european", "europe"}, {"fijian", "fiji"},
116 {"filipino", "philippines"}, {"finnish", "finland"},
117 {"french", "france"}, {"gambian", "gambia"}, {"georgian", "georgia"},
118 {"german", "germany"}, {"ghanian", "ghana"}, {"greek", "greece"},
119 {"grenadan", "grenada"}, {"guamian", "guam"},
120 {"guatemalan", "guatemala"}, {"guinean", "guinea"},
121 {"guyanan", "guyana"}, {"haitian", "haiti"}, {"hawaiian", "hawaii"},
122 {"holland", "dutch"}, {"honduran", "honduras"}, {"hungarian", "hungary"},
123 {"icelandic", "iceland"}, {"indonesian", "indonesia"},
124 {"iranian", "iran"}, {"iraqi", "iraq"}, {"iraqui", "iraq"},
125 {"irish", "ireland"}, {"israeli", "israel"},
126 {"italian", "italy"},
127 {"jamaican", "jamaica"},
128 {"japanese", "japan"},
129 {"jordanian", "jordan"},
130 {"kampuchean", "cambodia"},
131 {"kenyan", "kenya"},
132 {"korean", "korea"},
133 {"kuwaiti", "kuwait"},
134 {"lankan", "lanka"},
135 {"laotian", "laos"},
136 {"latvian", "latvia"},
137 {"lebanese", "lebanon"},
138 {"liberian", "liberia"},
139 {"libyan", "libya"},
140 {"lithuanian", "lithuania"},
141 {"macedonian", "macedonia"},
142 {"madagascan", "madagascar"},
143 {"malaysian", "malaysia"},
144 {"maltese", "malta"},
145 {"mauritanian", "mauritania"},
146 {"mexican", "mexico"},
147 {"micronesian", "micronesia"},
148 {"moldovan", "moldova"},
149 {"monacan", "monaco"},
150 {"mongolian", "mongolia"},
151 {"montenegran", "montenegro"},
152 {"moroccan", "morocco"},
153 {"myanmar", "burma"},
154 {"namibian", "namibia"},
155 {"nepalese", "nepal"},
156
157 {"nicaraguan", "nicaragua"}, {"nigerian", "nigeria"},
158 {"norwegian", "norway"}, {"omani", "oman"}, {"pakistani", "pakistan"},
159 {"panamanian", "panama"}, {"papuan", "papua"},
160 {"paraguayan", "paraguay"}, {"peruvian", "peru"},
161 {"portuguese", "portugal"}, {"romanian", "romania"},
162 {"rumania", "romania"}, {"rumanian", "romania"}, {"russian", "russia"},
163 {"rwandan", "rwanda"}, {"samoan", "samoa"}, {"scottish", "scotland"},
164 {"serb", "serbia"}, {"serbian", "serbia"}, {"siam", "thailand"},
165 {"siamese", "thailand"}, {"slovakia", "slovak"}, {"slovakian", "slovak"},
166 {"slovenian", "slovenia"}, {"somali", "somalia"},
167 {"somalian", "somalia"}, {"spanish", "spain"}, {"swedish", "sweden"},
168 {"swiss", "switzerland"}, {"syrian", "syria"}, {"taiwanese", "taiwan"},
169 {"tanzanian", "tanzania"}, {"texan", "texas"}, {"thai", "thailand"},
170 {"tunisian", "tunisia"}, {"turkish", "turkey"}, {"ugandan", "uganda"},
171 {"ukrainian", "ukraine"}, {"uruguayan", "uruguay"},
172 {"uzbek", "uzbekistan"}, {"venezuelan", "venezuela"},
173 {"vietnamese", "viet"}, {"virginian", "virginia"}, {"yemeni", "yemen"},
174 {"yugoslav", "yugoslavia"}, {"yugoslavian", "yugoslavia"},
175 {"zambian", "zambia"}, {"zealander", "zealand"},
176 {"zimbabwean", "zimbabwe"}};
177
178 static private final String[] supplementDict = {"aids", "applicator",
179 "capacitor", "digitize", "electromagnet", "ellipsoid", "exosphere",
180 "extensible", "ferromagnet", "graphics", "hydromagnet", "polygraph",
181 "toroid", "superconduct", "backscatter", "connectionism"};
182
183 static private final String[] properNouns = {"abrams", "achilles",
184 "acropolis", "adams", "agnes", "aires", "alexander", "alexis", "alfred",
185 "algiers", "alps", "amadeus", "ames", "amos", "andes", "angeles",
186 "annapolis", "antilles", "aquarius", "archimedes", "arkansas", "asher",
187 "ashly", "athens", "atkins", "atlantis", "avis", "bahamas", "bangor",
188 "barbados", "barger", "bering", "brahms", "brandeis", "brussels",
189 "bruxelles", "cairns", "camoros", "camus", "carlos", "celts", "chalker",
190 "charles", "cheops", "ching", "christmas", "cocos", "collins",
191 "columbus", "confucius", "conners", "connolly", "copernicus", "cramer",
192 "cyclops", "cygnus", "cyprus", "dallas", "damascus", "daniels", "davies",
193 "davis", "decker", "denning", "dennis", "descartes", "dickens", "doris",
194 "douglas", "downs", "dreyfus", "dukakis", "dulles", "dumfries",
195 "ecclesiastes", "edwards", "emily", "erasmus", "euphrates", "evans",
196 "everglades", "fairbanks", "federales", "fisher", "fitzsimmons",
197 "fleming", "forbes", "fowler", "france", "francis", "goering",
198 "goodling", "goths", "grenadines", "guiness", "hades", "harding",
199 "harris", "hastings", "hawkes", "hawking", "hayes", "heights",
200 "hercules", "himalayas", "hippocrates", "hobbs", "holmes", "honduras",
201 "hopkins", "hughes", "humphreys", "illinois", "indianapolis",
202 "inverness", "iris", "iroquois", "irving", "isaacs", "italy", "james",
203 "jarvis", "jeffreys", "jesus", "jones", "josephus", "judas", "julius",
204 "kansas", "keynes", "kipling", "kiwanis", "lansing", "laos", "leeds",
205 "levis", "leviticus", "lewis", "louis", "maccabees", "madras",
206 "maimonides", "maldive", "massachusetts", "matthews", "mauritius",
207 "memphis", "mercedes", "midas", "mingus", "minneapolis", "mohammed",
208 "moines", "morris", "moses", "myers", "myknos", "nablus", "nanjing",
209 "nantes", "naples", "neal", "netherlands", "nevis", "nostradamus",
210 "oedipus", "olympus", "orleans", "orly", "papas", "paris", "parker",
211 "pauling", "peking", "pershing", "peter", "peters", "philippines",
212 "phineas", "pisces", "pryor", "pythagoras", "queens", "rabelais",
213 "ramses", "reynolds", "rhesus", "rhodes", "richards", "robins",
214 "rodgers", "rogers", "rubens", "sagittarius", "seychelles", "socrates",
215 "texas", "thames", "thomas", "tiberias", "tunis", "venus", "vilnius",
216 "wales", "warner", "wilkins", "williams", "wyoming", "xmas", "yonkers",
217 "zeus", "frances", "aarhus", "adonis", "andrews", "angus", "antares",
218 "aquinas", "arcturus", "ares", "artemis", "augustus", "ayers",
219 "barnabas", "barnes", "becker", "bejing", "biggs", "billings", "boeing",
220 "boris", "borroughs", "briggs", "buenos", "calais", "caracas", "cassius",
221 "cerberus", "ceres", "cervantes", "chantilly", "chartres", "chester",
222 "connally", "conner", "coors", "cummings", "curtis", "daedalus",
223 "dionysus", "dobbs", "dolores", "edmonds"};
224
225 static class DictEntry {
226 boolean exception;
227 String root;
228
229 DictEntry(String root, boolean isException) {
230 this.root = root;
231 this.exception = isException;
232 }
233 }
234
235 private static final CharArrayMap<DictEntry> dict_ht = initializeDictHash();
236
237
238
239
240
241
242
243 private final OpenStringBuilder word = new OpenStringBuilder();
244 private int j;
245 private int k;
246
247
248
249
250
251
252
253
254
255
256 private char finalChar() {
257 return word.charAt(k);
258 }
259
260 private char penultChar() {
261 return word.charAt(k - 1);
262 }
263
264 private boolean isVowel(int index) {
265 return !isCons(index);
266 }
267
268 private boolean isCons(int index) {
269 char ch;
270
271 ch = word.charAt(index);
272
273 if ((ch == 'a') || (ch == 'e') || (ch == 'i') || (ch == 'o') || (ch == 'u')) return false;
274 if ((ch != 'y') || (index == 0)) return true;
275 else return (!isCons(index - 1));
276 }
277
278 private static CharArrayMap<DictEntry> initializeDictHash() {
279 DictEntry defaultEntry;
280 DictEntry entry;
281
282 CharArrayMap<DictEntry> d = new CharArrayMap<>(1000, false);
283 for (int i = 0; i < exceptionWords.length; i++) {
284 if (!d.containsKey(exceptionWords[i])) {
285 entry = new DictEntry(exceptionWords[i], true);
286 d.put(exceptionWords[i], entry);
287 } else {
288 throw new RuntimeException("Warning: Entry [" + exceptionWords[i]
289 + "] already in dictionary 1");
290 }
291 }
292
293 for (int i = 0; i < directConflations.length; i++) {
294 if (!d.containsKey(directConflations[i][0])) {
295 entry = new DictEntry(directConflations[i][1], false);
296 d.put(directConflations[i][0], entry);
297 } else {
298 throw new RuntimeException("Warning: Entry [" + directConflations[i][0]
299 + "] already in dictionary 2");
300 }
301 }
302
303 for (int i = 0; i < countryNationality.length; i++) {
304 if (!d.containsKey(countryNationality[i][0])) {
305 entry = new DictEntry(countryNationality[i][1], false);
306 d.put(countryNationality[i][0], entry);
307 } else {
308 throw new RuntimeException("Warning: Entry [" + countryNationality[i][0]
309 + "] already in dictionary 3");
310 }
311 }
312
313 defaultEntry = new DictEntry(null, false);
314
315 String[] array;
316 array = KStemData1.data;
317
318 for (int i = 0; i < array.length; i++) {
319 if (!d.containsKey(array[i])) {
320 d.put(array[i], defaultEntry);
321 } else {
322 throw new RuntimeException("Warning: Entry [" + array[i]
323 + "] already in dictionary 4");
324 }
325 }
326
327 array = KStemData2.data;
328 for (int i = 0; i < array.length; i++) {
329 if (!d.containsKey(array[i])) {
330 d.put(array[i], defaultEntry);
331 } else {
332 throw new RuntimeException("Warning: Entry [" + array[i]
333 + "] already in dictionary 4");
334 }
335 }
336
337 array = KStemData3.data;
338 for (int i = 0; i < array.length; i++) {
339 if (!d.containsKey(array[i])) {
340 d.put(array[i], defaultEntry);
341 } else {
342 throw new RuntimeException("Warning: Entry [" + array[i]
343 + "] already in dictionary 4");
344 }
345 }
346
347 array = KStemData4.data;
348 for (int i = 0; i < array.length; i++) {
349 if (!d.containsKey(array[i])) {
350 d.put(array[i], defaultEntry);
351 } else {
352 throw new RuntimeException("Warning: Entry [" + array[i]
353 + "] already in dictionary 4");
354 }
355 }
356
357 array = KStemData5.data;
358 for (int i = 0; i < array.length; i++) {
359 if (!d.containsKey(array[i])) {
360 d.put(array[i], defaultEntry);
361 } else {
362 throw new RuntimeException("Warning: Entry [" + array[i]
363 + "] already in dictionary 4");
364 }
365 }
366
367 array = KStemData6.data;
368 for (int i = 0; i < array.length; i++) {
369 if (!d.containsKey(array[i])) {
370 d.put(array[i], defaultEntry);
371 } else {
372 throw new RuntimeException("Warning: Entry [" + array[i]
373 + "] already in dictionary 4");
374 }
375 }
376
377 array = KStemData7.data;
378 for (int i = 0; i < array.length; i++) {
379 if (!d.containsKey(array[i])) {
380 d.put(array[i], defaultEntry);
381 } else {
382 throw new RuntimeException("Warning: Entry [" + array[i]
383 + "] already in dictionary 4");
384 }
385 }
386
387 for (int i = 0; i < KStemData8.data.length; i++) {
388 if (!d.containsKey(KStemData8.data[i])) {
389 d.put(KStemData8.data[i], defaultEntry);
390 } else {
391 throw new RuntimeException("Warning: Entry [" + KStemData8.data[i]
392 + "] already in dictionary 4");
393 }
394 }
395
396 for (int i = 0; i < supplementDict.length; i++) {
397 if (!d.containsKey(supplementDict[i])) {
398 d.put(supplementDict[i], defaultEntry);
399 } else {
400 throw new RuntimeException("Warning: Entry [" + supplementDict[i]
401 + "] already in dictionary 5");
402 }
403 }
404
405 for (int i = 0; i < properNouns.length; i++) {
406 if (!d.containsKey(properNouns[i])) {
407 d.put(properNouns[i], defaultEntry);
408 } else {
409 throw new RuntimeException("Warning: Entry [" + properNouns[i]
410 + "] already in dictionary 6");
411 }
412 }
413
414 return d;
415 }
416
417 private boolean isAlpha(char ch) {
418 return ch >= 'a' && ch <= 'z';
419 }
420
421
422 private int stemLength() {
423 return j + 1;
424 };
425
426 private boolean endsIn(char[] s) {
427 if (s.length > k) return false;
428
429 int r = word.length() - s.length;
430 j = k;
431 for (int r1 = r, i = 0; i < s.length; i++, r1++) {
432 if (s[i] != word.charAt(r1)) return false;
433 }
434 j = r - 1;
435 return true;
436 }
437
438 private boolean endsIn(char a, char b) {
439 if (2 > k) return false;
440
441 if (word.charAt(k - 1) == a && word.charAt(k) == b) {
442 j = k - 2;
443 return true;
444 }
445 return false;
446 }
447
448 private boolean endsIn(char a, char b, char c) {
449 if (3 > k) return false;
450 if (word.charAt(k - 2) == a && word.charAt(k - 1) == b
451 && word.charAt(k) == c) {
452 j = k - 3;
453 return true;
454 }
455 return false;
456 }
457
458 private boolean endsIn(char a, char b, char c, char d) {
459 if (4 > k) return false;
460 if (word.charAt(k - 3) == a && word.charAt(k - 2) == b
461 && word.charAt(k - 1) == c && word.charAt(k) == d) {
462 j = k - 4;
463 return true;
464 }
465 return false;
466 }
467
468 private DictEntry wordInDict() {
469
470
471
472
473
474
475 if (matchedEntry != null) return matchedEntry;
476 DictEntry e = dict_ht.get(word.getArray(), 0, word.length());
477 if (e != null && !e.exception) {
478 matchedEntry = e;
479 }
480
481 return e;
482 }
483
484
485 private void plural() {
486 if (word.charAt(k) == 's') {
487 if (endsIn('i', 'e', 's')) {
488 word.setLength(j + 3);
489 k--;
490 if (lookup())
491 return;
492 k++;
493 word.unsafeWrite('s');
494 setSuffix("y");
495 lookup();
496 } else if (endsIn('e', 's')) {
497
498 word.setLength(j + 2);
499 k--;
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514 boolean tryE = j > 0
515 && !((word.charAt(j) == 's') && (word.charAt(j - 1) == 's'));
516 if (tryE && lookup()) return;
517
518
519
520 word.setLength(j + 1);
521 k--;
522 if (lookup()) return;
523
524
525 word.unsafeWrite('e');
526 k++;
527
528 if (!tryE) lookup();
529 return;
530 } else {
531 if (word.length() > 3 && penultChar() != 's' && !endsIn('o', 'u', 's')) {
532
533
534 word.setLength(k);
535 k--;
536 lookup();
537 }
538 }
539 }
540 }
541
542 private void setSuffix(String s) {
543 setSuff(s, s.length());
544 }
545
546
547 private void setSuff(String s, int len) {
548 word.setLength(j + 1);
549 for (int l = 0; l < len; l++) {
550 word.unsafeWrite(s.charAt(l));
551 }
552 k = j + len;
553 }
554
555
556
557
558
559 DictEntry matchedEntry = null;
560
561 private boolean lookup() {
562
563
564
565
566
567
568
569
570
571
572 matchedEntry = dict_ht.get(word.getArray(), 0, word.size());
573 return matchedEntry != null;
574 }
575
576
577
578
579 private void pastTense() {
580
581
582
583
584 if (word.length() <= 4) return;
585
586 if (endsIn('i', 'e', 'd')) {
587 word.setLength(j + 3);
588 k--;
589 if (lookup())
590 return;
591 k++;
592 word.unsafeWrite('d');
593 setSuffix("y");
594 lookup();
595 return;
596 }
597
598
599 if (endsIn('e', 'd') && vowelInStem()) {
600
601 word.setLength(j + 2);
602 k = j + 1;
603
604 DictEntry entry = wordInDict();
605 if (entry != null) if (!entry.exception)
606
607
608
609 return;
610
611
612 word.setLength(j + 1);
613 k = j;
614 if (lookup()) return;
615
616
617
618
619
620
621
622
623 if (doubleC(k)) {
624 word.setLength(k);
625 k--;
626 if (lookup()) return;
627 word.unsafeWrite(word.charAt(k));
628 k++;
629 lookup();
630 return;
631 }
632
633
634
635
636
637 if ((word.charAt(0) == 'u') && (word.charAt(1) == 'n')) {
638 word.unsafeWrite('e');
639 word.unsafeWrite('d');
640 k = k + 2;
641
642 return;
643 }
644
645
646
647
648
649
650 word.setLength(j + 1);
651 word.unsafeWrite('e');
652 k = j + 1;
653
654 return;
655 }
656 }
657
658
659 private boolean doubleC(int i) {
660 if (i < 1) return false;
661
662 if (word.charAt(i) != word.charAt(i - 1)) return false;
663 return (isCons(i));
664 }
665
666 private boolean vowelInStem() {
667 for (int i = 0; i < stemLength(); i++) {
668 if (isVowel(i)) return true;
669 }
670 return false;
671 }
672
673
674 private void aspect() {
675
676
677
678
679
680
681
682 if (word.length() <= 5) return;
683
684
685 if (endsIn('i', 'n', 'g') && vowelInStem()) {
686
687
688 word.setCharAt(j + 1, 'e');
689 word.setLength(j + 2);
690 k = j + 1;
691
692 DictEntry entry = wordInDict();
693 if (entry != null) {
694 if (!entry.exception)
695 return;
696 }
697
698
699 word.setLength(k);
700 k--;
701
702 if (lookup()) return;
703
704
705 if (doubleC(k)) {
706 k--;
707 word.setLength(k + 1);
708 if (lookup()) return;
709 word.unsafeWrite(word.charAt(k));
710
711
712
713
714
715
716 k++;
717 lookup();
718 return;
719 }
720
721
722
723
724
725
726
727
728
729
730
731
732
733 if ((j > 0) && isCons(j) && isCons(j - 1)) {
734 k = j;
735 word.setLength(k + 1);
736
737 return;
738 }
739
740 word.setLength(j + 1);
741 word.unsafeWrite('e');
742 k = j + 1;
743
744 return;
745 }
746 }
747
748
749
750
751
752
753
754 private void ityEndings() {
755 int old_k = k;
756
757 if (endsIn('i', 't', 'y')) {
758 word.setLength(j + 1);
759 k = j;
760 if (lookup()) return;
761 word.unsafeWrite('e');
762 k = j + 1;
763 if (lookup()) return;
764 word.setCharAt(j + 1, 'i');
765 word.append("ty");
766 k = old_k;
767
768
769
770
771 if ((j > 0) && (word.charAt(j - 1) == 'i') && (word.charAt(j) == 'l')) {
772 word.setLength(j - 1);
773 word.append("le");
774 k = j;
775 lookup();
776 return;
777 }
778
779
780 if ((j > 0) && (word.charAt(j - 1) == 'i') && (word.charAt(j) == 'v')) {
781 word.setLength(j + 1);
782 word.unsafeWrite('e');
783 k = j + 1;
784 lookup();
785 return;
786 }
787
788 if ((j > 0) && (word.charAt(j - 1) == 'a') && (word.charAt(j) == 'l')) {
789 word.setLength(j + 1);
790 k = j;
791 lookup();
792 return;
793 }
794
795
796
797
798
799
800
801
802 if (lookup()) return;
803
804
805 word.setLength(j + 1);
806 k = j;
807
808 return;
809 }
810 }
811
812
813 private void nceEndings() {
814 int old_k = k;
815 char word_char;
816
817 if (endsIn('n', 'c', 'e')) {
818 word_char = word.charAt(j);
819 if (!((word_char == 'e') || (word_char == 'a'))) return;
820 word.setLength(j);
821 word.unsafeWrite('e');
822 k = j;
823 if (lookup()) return;
824 word.setLength(j);
825
826
827
828 k = j - 1;
829 if (lookup()) return;
830 word.unsafeWrite(word_char);
831 word.append("nce");
832 k = old_k;
833
834 }
835 return;
836 }
837
838
839 private void nessEndings() {
840 if (endsIn('n', 'e', 's', 's')) {
841
842
843
844 word.setLength(j + 1);
845 k = j;
846 if (word.charAt(j) == 'i') word.setCharAt(j, 'y');
847 lookup();
848 }
849 return;
850 }
851
852
853 private void ismEndings() {
854 if (endsIn('i', 's', 'm')) {
855
856
857
858 word.setLength(j + 1);
859 k = j;
860 lookup();
861 }
862 return;
863 }
864
865
866 private void mentEndings() {
867 int old_k = k;
868
869 if (endsIn('m', 'e', 'n', 't')) {
870 word.setLength(j + 1);
871 k = j;
872 if (lookup()) return;
873 word.append("ment");
874 k = old_k;
875
876 }
877 return;
878 }
879
880
881 private void izeEndings() {
882 int old_k = k;
883
884 if (endsIn('i', 'z', 'e')) {
885 word.setLength(j + 1);
886 k = j;
887 if (lookup()) return;
888 word.unsafeWrite('i');
889
890 if (doubleC(j)) {
891 word.setLength(j);
892 k = j - 1;
893 if (lookup()) return;
894 word.unsafeWrite(word.charAt(j - 1));
895 }
896
897 word.setLength(j + 1);
898 word.unsafeWrite('e');
899 k = j + 1;
900 if (lookup()) return;
901 word.setLength(j + 1);
902 word.append("ize");
903 k = old_k;
904
905 }
906 return;
907 }
908
909
910 private void ncyEndings() {
911 if (endsIn('n', 'c', 'y')) {
912 if (!((word.charAt(j) == 'e') || (word.charAt(j) == 'a'))) return;
913 word.setCharAt(j + 2, 't');
914 word.setLength(j + 3);
915 k = j + 2;
916
917 if (lookup()) return;
918
919 word.setCharAt(j + 2, 'c');
920 word.unsafeWrite('e');
921 k = j + 3;
922 lookup();
923 }
924 return;
925 }
926
927
928 private void bleEndings() {
929 int old_k = k;
930 char word_char;
931
932 if (endsIn('b', 'l', 'e')) {
933 if (!((word.charAt(j) == 'a') || (word.charAt(j) == 'i'))) return;
934 word_char = word.charAt(j);
935 word.setLength(j);
936 k = j - 1;
937 if (lookup()) return;
938 if (doubleC(k)) {
939 word.setLength(k);
940 k--;
941 if (lookup()) return;
942 k++;
943 word.unsafeWrite(word.charAt(k - 1));
944 }
945 word.setLength(j);
946 word.unsafeWrite('e');
947 k = j;
948 if (lookup()) return;
949 word.setLength(j);
950 word.append("ate");
951
952 k = j + 2;
953 if (lookup()) return;
954 word.setLength(j);
955 word.unsafeWrite(word_char);
956 word.append("ble");
957 k = old_k;
958
959 }
960 return;
961 }
962
963
964
965
966
967
968 private void icEndings() {
969 if (endsIn('i', 'c')) {
970 word.setLength(j + 3);
971 word.append("al");
972 k = j + 4;
973 if (lookup()) return;
974
975 word.setCharAt(j + 1, 'y');
976 word.setLength(j + 2);
977 k = j + 1;
978 if (lookup()) return;
979
980 word.setCharAt(j + 1, 'e');
981 if (lookup()) return;
982
983 word.setLength(j + 1);
984 k = j;
985 if (lookup()) return;
986 word.append("ic");
987 k = j + 2;
988
989 }
990 return;
991 }
992
993 private static char[] ization = "ization".toCharArray();
994 private static char[] ition = "ition".toCharArray();
995 private static char[] ation = "ation".toCharArray();
996 private static char[] ication = "ication".toCharArray();
997
998
999
1000
1001
1002
1003 private void ionEndings() {
1004 int old_k = k;
1005 if (!endsIn('i', 'o', 'n')) {
1006 return;
1007 }
1008
1009 if (endsIn(ization)) {
1010
1011
1012
1013 word.setLength(j + 3);
1014 word.unsafeWrite('e');
1015 k = j + 3;
1016 lookup();
1017 return;
1018 }
1019
1020 if (endsIn(ition)) {
1021 word.setLength(j + 1);
1022 word.unsafeWrite('e');
1023 k = j + 1;
1024 if (lookup())
1025
1026
1027
1028 return;
1029
1030
1031 word.setLength(j + 1);
1032 word.append("ition");
1033 k = old_k;
1034
1035 } else if (endsIn(ation)) {
1036 word.setLength(j + 3);
1037 word.unsafeWrite('e');
1038 k = j + 3;
1039 if (lookup())
1040 return;
1041
1042 word.setLength(j + 1);
1043 word.unsafeWrite('e');
1044
1045
1046
1047 k = j + 1;
1048 if (lookup()) return;
1049
1050 word.setLength(j + 1);
1051
1052
1053
1054 k = j;
1055 if (lookup()) return;
1056
1057
1058 word.setLength(j + 1);
1059 word.append("ation");
1060 k = old_k;
1061
1062
1063 }
1064
1065
1066
1067
1068
1069
1070 if (endsIn(ication)) {
1071 word.setLength(j + 1);
1072 word.unsafeWrite('y');
1073 k = j + 1;
1074 if (lookup())
1075
1076
1077
1078 return;
1079
1080
1081 word.setLength(j + 1);
1082 word.append("ication");
1083 k = old_k;
1084
1085 }
1086
1087
1088 if (true) {
1089 j = k - 3;
1090
1091 word.setLength(j + 1);
1092 word.unsafeWrite('e');
1093 k = j + 1;
1094 if (lookup())
1095 return;
1096
1097 word.setLength(j + 1);
1098 k = j;
1099 if (lookup())
1100 return;
1101
1102
1103 word.setLength(j + 1);
1104 word.append("ion");
1105 k = old_k;
1106
1107 }
1108
1109
1110 return;
1111 }
1112
1113
1114
1115
1116
1117 private void erAndOrEndings() {
1118 int old_k = k;
1119
1120 if (word.charAt(k) != 'r') return;
1121
1122 char word_char;
1123
1124 if (endsIn('i', 'z', 'e', 'r')) {
1125
1126
1127
1128 word.setLength(j + 4);
1129 k = j + 3;
1130 lookup();
1131 return;
1132 }
1133
1134 if (endsIn('e', 'r') || endsIn('o', 'r')) {
1135 word_char = word.charAt(j + 1);
1136 if (doubleC(j)) {
1137 word.setLength(j);
1138 k = j - 1;
1139 if (lookup()) return;
1140 word.unsafeWrite(word.charAt(j - 1));
1141 }
1142
1143 if (word.charAt(j) == 'i') {
1144 word.setCharAt(j, 'y');
1145 word.setLength(j + 1);
1146 k = j;
1147 if (lookup())
1148 return;
1149 word.setCharAt(j, 'i');
1150 word.unsafeWrite('e');
1151 }
1152
1153 if (word.charAt(j) == 'e') {
1154 word.setLength(j);
1155 k = j - 1;
1156 if (lookup()) return;
1157 word.unsafeWrite('e');
1158 }
1159
1160 word.setLength(j + 2);
1161 k = j + 1;
1162 if (lookup()) return;
1163 word.setLength(j + 1);
1164 k = j;
1165 if (lookup()) return;
1166 word.unsafeWrite('e');
1167 k = j + 1;
1168 if (lookup()) return;
1169 word.setLength(j + 1);
1170 word.unsafeWrite(word_char);
1171 word.unsafeWrite('r');
1172 k = old_k;
1173
1174 }
1175
1176 }
1177
1178
1179
1180
1181
1182
1183
1184 private void lyEndings() {
1185 int old_k = k;
1186
1187 if (endsIn('l', 'y')) {
1188
1189 word.setCharAt(j + 2, 'e');
1190
1191 if (lookup()) return;
1192 word.setCharAt(j + 2, 'y');
1193
1194 word.setLength(j + 1);
1195 k = j;
1196
1197 if (lookup()) return;
1198
1199 if ((j > 0) && (word.charAt(j - 1) == 'a') && (word.charAt(j) == 'l'))
1200
1201
1202
1203
1204
1205
1206
1207
1208 return;
1209 word.append("ly");
1210 k = old_k;
1211
1212 if ((j > 0) && (word.charAt(j - 1) == 'a') && (word.charAt(j) == 'b')) {
1213
1214
1215
1216
1217
1218
1219
1220
1221 word.setCharAt(j + 2, 'e');
1222 k = j + 2;
1223 return;
1224 }
1225
1226 if (word.charAt(j) == 'i') {
1227 word.setLength(j);
1228 word.unsafeWrite('y');
1229 k = j;
1230 if (lookup()) return;
1231 word.setLength(j);
1232 word.append("ily");
1233 k = old_k;
1234 }
1235
1236 word.setLength(j + 1);
1237
1238 k = j;
1239
1240 }
1241 return;
1242 }
1243
1244
1245
1246
1247
1248 private void alEndings() {
1249 int old_k = k;
1250
1251 if (word.length() < 4) return;
1252 if (endsIn('a', 'l')) {
1253 word.setLength(j + 1);
1254 k = j;
1255 if (lookup())
1256 return;
1257
1258 if (doubleC(j)) {
1259 word.setLength(j);
1260 k = j - 1;
1261 if (lookup()) return;
1262 word.unsafeWrite(word.charAt(j - 1));
1263 }
1264
1265 word.setLength(j + 1);
1266 word.unsafeWrite('e');
1267 k = j + 1;
1268 if (lookup()) return;
1269
1270 word.setLength(j + 1);
1271 word.append("um");
1272
1273 k = j + 2;
1274 if (lookup()) return;
1275
1276 word.setLength(j + 1);
1277 word.append("al");
1278 k = old_k;
1279
1280 if ((j > 0) && (word.charAt(j - 1) == 'i') && (word.charAt(j) == 'c')) {
1281 word.setLength(j - 1);
1282 k = j - 2;
1283 if (lookup()) return;
1284
1285 word.setLength(j - 1);
1286 word.unsafeWrite('y');
1287 k = j - 1;
1288 if (lookup()) return;
1289
1290 word.setLength(j - 1);
1291 word.append("ic");
1292 k = j;
1293
1294
1295
1296 lookup();
1297 return;
1298 }
1299
1300 if (word.charAt(j) == 'i') {
1301 word.setLength(j);
1302 k = j - 1;
1303 if (lookup()) return;
1304 word.append("ial");
1305 k = old_k;
1306 lookup();
1307 }
1308
1309 }
1310 return;
1311 }
1312
1313
1314
1315
1316
1317 private void iveEndings() {
1318 int old_k = k;
1319
1320 if (endsIn('i', 'v', 'e')) {
1321 word.setLength(j + 1);
1322 k = j;
1323 if (lookup()) return;
1324
1325 word.unsafeWrite('e');
1326 k = j + 1;
1327 if (lookup()) return;
1328 word.setLength(j + 1);
1329 word.append("ive");
1330 if ((j > 0) && (word.charAt(j - 1) == 'a') && (word.charAt(j) == 't')) {
1331 word.setCharAt(j - 1, 'e');
1332 word.setLength(j);
1333 k = j - 1;
1334 if (lookup()) return;
1335 word.setLength(j - 1);
1336 if (lookup()) return;
1337
1338 word.append("ative");
1339 k = old_k;
1340 }
1341
1342
1343 word.setCharAt(j + 2, 'o');
1344 word.setCharAt(j + 3, 'n');
1345 if (lookup()) return;
1346
1347 word.setCharAt(j + 2, 'v');
1348 word.setCharAt(j + 3, 'e');
1349 k = old_k;
1350
1351 }
1352 return;
1353 }
1354
/** Creates a stemmer; instances are only constructible from within this package. */
KStemmer() {
}
1356
1357 String stem(String term) {
1358 boolean changed = stem(term.toCharArray(), term.length());
1359 if (!changed) return term;
1360 return asString();
1361 }
1362
1363
1364
1365
1366 String asString() {
1367 String s = getString();
1368 if (s != null) return s;
1369 return word.toString();
1370 }
1371
1372 CharSequence asCharSequence() {
1373 return result != null ? result : word;
1374 }
1375
1376 String getString() {
1377 return result;
1378 }
1379
1380 char[] getChars() {
1381 return word.getArray();
1382 }
1383
1384 int getLength() {
1385 return word.length();
1386 }
1387
1388 String result;
1389
1390 private boolean matched() {
1391
1392
1393
1394
1395
1396
1397 return matchedEntry != null;
1398 }
1399
1400
1401
1402
1403 boolean stem(char[] term, int len) {
1404
1405 result = null;
1406
1407 k = len - 1;
1408 if ((k <= 1) || (k >= MaxWordLen - 1)) {
1409 return false;
1410 }
1411
1412
1413
1414 DictEntry entry = dict_ht.get(term, 0, len);
1415 if (entry != null) {
1416 if (entry.root != null) {
1417 result = entry.root;
1418 return true;
1419 }
1420 return false;
1421 }
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431 word.reset();
1432
1433 word.reserve(len + 10);
1434 for (int i = 0; i < len; i++) {
1435 char ch = term[i];
1436 if (!isAlpha(ch)) return false;
1437
1438
1439 word.unsafeWrite(ch);
1440 }
1441
1442 matchedEntry = null;
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452 while (true) {
1453
1454
1455 plural();
1456 if (matched()) break;
1457 pastTense();
1458 if (matched()) break;
1459 aspect();
1460 if (matched()) break;
1461 ityEndings();
1462 if (matched()) break;
1463 nessEndings();
1464 if (matched()) break;
1465 ionEndings();
1466 if (matched()) break;
1467 erAndOrEndings();
1468 if (matched()) break;
1469 lyEndings();
1470 if (matched()) break;
1471 alEndings();
1472 if (matched()) break;
1473 entry = wordInDict();
1474 iveEndings();
1475 if (matched()) break;
1476 izeEndings();
1477 if (matched()) break;
1478 mentEndings();
1479 if (matched()) break;
1480 bleEndings();
1481 if (matched()) break;
1482 ismEndings();
1483 if (matched()) break;
1484 icEndings();
1485 if (matched()) break;
1486 ncyEndings();
1487 if (matched()) break;
1488 nceEndings();
1489 matched();
1490 break;
1491 }
1492
1493
1494
1495
1496
1497 entry = matchedEntry;
1498 if (entry != null) {
1499 result = entry.root;
1500 }
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518 return true;
1519 }
1520
1521 }